{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3-寻找最优参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
      "  DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn import svm\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "from sklearn import metrics\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy.io as sio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'yval', 'Xval'])\n"
     ]
    }
   ],
   "source": [
    "mat = sio.loadmat('./data/ex6data3.mat')\n",
    "print(mat.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "training = pd.DataFrame(mat.get('X'), columns=['X1', 'X2'])\n",
    "training['y'] = mat.get('y')\n",
    "\n",
    "cv = pd.DataFrame(mat.get('Xval'), columns=['X1', 'X2'])\n",
    "cv['y'] = mat.get('yval')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(211, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.158986</td>\n",
       "      <td>0.423977</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.347926</td>\n",
       "      <td>0.470760</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.504608</td>\n",
       "      <td>0.353801</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.596774</td>\n",
       "      <td>0.114035</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.518433</td>\n",
       "      <td>-0.172515</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         X1        X2  y\n",
       "0 -0.158986  0.423977  1\n",
       "1 -0.347926  0.470760  1\n",
       "2 -0.504608  0.353801  1\n",
       "3 -0.596774  0.114035  1\n",
       "4 -0.518433 -0.172515  1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(training.shape)\n",
    "training.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(200, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>X1</th>\n",
       "      <th>X2</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.353062</td>\n",
       "      <td>-0.673902</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.227126</td>\n",
       "      <td>0.447320</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.092898</td>\n",
       "      <td>-0.753524</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.148243</td>\n",
       "      <td>-0.718473</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.001512</td>\n",
       "      <td>0.162928</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         X1        X2  y\n",
       "0 -0.353062 -0.673902  0\n",
       "1 -0.227126  0.447320  1\n",
       "2  0.092898 -0.753524  0\n",
       "3  0.148243 -0.718473  0\n",
       "4 -0.001512  0.162928  0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(cv.shape)\n",
    "cv.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# manual grid search for $C$ and $\\sigma$\n",
    "http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "candidate = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "81"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# gamma to comply with sklearn parameter name\n",
    "combination = [(C, gamma) for C in candidate for gamma in candidate]\n",
    "len(combination)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "search = []\n",
    "\n",
    "for C, gamma in combination:\n",
    "    svc = svm.SVC(C=C, gamma=gamma)\n",
    "    svc.fit(training[['X1', 'X2']], training['y'])\n",
    "    search.append(svc.score(cv[['X1', 'X2']], cv['y']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.965 (0.3, 100)\n"
     ]
    }
   ],
   "source": [
    "best_score = search[np.argmax(search)]\n",
    "best_param = combination[np.argmax(search)]\n",
    "\n",
    "print(best_score, best_param)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.92      0.96      0.94       113\n",
      "          1       0.94      0.89      0.91        87\n",
      "\n",
      "avg / total       0.93      0.93      0.92       200\n",
      "\n"
     ]
    }
   ],
   "source": [
    "best_svc = svm.SVC(C=100, gamma=0.3)\n",
    "best_svc.fit(training[['X1', 'X2']], training['y'])\n",
    "ypred = best_svc.predict(cv[['X1', 'X2']])\n",
    "\n",
    "print(metrics.classification_report(cv['y'], ypred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# sklearn `GridSearchCV` （网格搜索）\n",
    "http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=None, error_score='raise',\n",
       "       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False),\n",
       "       fit_params={}, iid=True, n_jobs=-1,\n",
       "       param_grid={'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100], 'gamma': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parameters = {'C': candidate, 'gamma': candidate}\n",
    "svc = svm.SVC()\n",
    "clf = GridSearchCV(svc, parameters, n_jobs=-1)\n",
    "clf.fit(training[['X1', 'X2']], training['y'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C': 10, 'gamma': 30}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9004739336492891"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.95      0.96      0.96       113\n",
      "          1       0.95      0.93      0.94        87\n",
      "\n",
      "avg / total       0.95      0.95      0.95       200\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ypred = clf.predict(cv[['X1', 'X2']])\n",
    "print(metrics.classification_report(cv['y'], ypred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    ">curiouly... they are not the same result. What?  \n",
    "\n",
    "So the built in sklearn grid search is trying to find the best candidate from **training set**  \n",
    "However, when we were doing manual grid search, we train using training set, but we pick the best from **cross validation set**. This is the reason of difference.\n",
    "\n",
    "### I was wrong. That is not the reason\n",
    "It turns out that **GridSearch** will appropriate part of data as CV and use it to find the best candidate.  \n",
    "So the reason for different result is just that GridSearch here is just using part of **training data** to train because it need part of data as cv set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
